library(xgboost)
library(randomForest)
library(tidyverse)
library(lubridate)

source('functions.r')
load("Table_construction.Rdata")
features = features %>%
  # Add other useful information:
  inner_join(
    data_before %>% 
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
  
  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Violence_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Violence_decile_score`, # Adjust for violent/general
    p_juv_fel_count = pmin(p_juv_fel_count,2),
    p_felprop_violarrest = pmin(p_felprop_violarrest,5),
    p_murder_arrest = pmin(p_murder_arrest,3),
    p_felassault_arrest = pmin(p_felassault_arrest,3),
    p_misdemassault_arrest = pmin(p_misdemassault_arrest,3),
    #p_famviol_arrest = pmin(p_famviol_arrest,3),
    p_sex_arrest = pmin(p_sex_arrest,3),
    p_weapons_arrest = pmin(p_weapons_arrest,3),
    p_n_on_probation = pmin(p_n_on_probation,5),
    p_current_on_probation = pmin(p_current_on_probation,5),
    p_prob_revoke = pmin(p_prob_revoke,5),
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0), # race == "Other" is the baseline
    
    # Subscales:
    vio_hist = p_juv_fel_count+
      p_felprop_violarrest+
      p_murder_arrest+
      p_felassault_arrest+
      p_misdemassault_arrest+
      #p_famviol_arrest+
      p_sex_arrest+
      p_weapons_arrest,
    history_noncomp=p_prob_revoke+
      p_probation+p_current_on_probation+
      p_n_on_probation,
    
    # Filters (TRUE for obserations to keep)
    filt1 = `Risk of Recidivism_decile_score` != -1, `Risk of Violence_decile_score` != -1, # Filter 1
    filt3 = !is.na(current_offense_date), # Filter 3
    filt4 = current_offense_date <= current_offense_date_limit, # Filter 4
    filt5 = p_current_age > 18 & p_current_age <= 70 # Filter 5
  )

Fit age polynomial

features_f_age = features %>%
  filter(filt1,filt5) %>%
  select(p_current_age, raw_score)

lb_age = features_f_age %>%
  group_by(p_current_age) %>%
  #arrange(raw_score, .by_group=TRUE) %>%
  arrange(raw_score) %>%
  top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value

mdl_age = lm(raw_score ~ 
               I(p_current_age^4) + 
               I(p_current_age^3) + 
               I(p_current_age^2) + 
               p_current_age, 
             data=lb_age)

# More precision for paper
summary(mdl_age)
## 
## Call:
## lm(formula = raw_score ~ I(p_current_age^4) + I(p_current_age^3) + 
##     I(p_current_age^2) + p_current_age, data = lb_age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57815 -0.00957  0.00722  0.03974  0.13353 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.554e+00  8.206e-01   1.894 0.061779 .  
## I(p_current_age^4)  4.078e-07  2.904e-07   1.404 0.164074    
## I(p_current_age^3) -8.818e-05  5.109e-05  -1.726 0.088148 .  
## I(p_current_age^2)  7.437e-03  3.227e-03   2.304 0.023731 *  
## p_current_age      -3.161e-01  8.631e-02  -3.662 0.000441 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09085 on 82 degrees of freedom
## Multiple R-squared:  0.9799, Adjusted R-squared:  0.979 
## F-statistic:  1001 on 4 and 82 DF,  p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "1.55398886995612506290e+00"  "4.07808994844784392896e-07" 
## [3] "-8.81767707692677913051e-05" "7.43693932415081908338e-03" 
## [5] "-3.16106532728162026302e-01"
## Add f(age) to features
features = features %>%
  mutate(
    f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__f_age = raw_score - f_age,
    filt6 = raw_score >= f_age
  )
## Age polynomial plot
xmin = 18
xmax = 70
xx = seq(xmin,xmax, length.out=1000)

ggplot()+
  geom_point(aes(x=p_current_age, raw_score), color="#619CFF",alpha=.3, data=features_f_age) +
  geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  ylab("Violent score") +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="none")

ggsave("Figures/age_LB_violent.pdf",width = 3.5, height = 2.5, units = "in")

Fit history of violence lower bound

### Compute lower bound of raw_score__f_age for each vio_hist value:

# Apply filters:
features_vio_hist = features %>%
  filter(filt1,filt3) %>% # Careful, filter 6 applied later since we need these points for plot
  select(vio_hist, raw_score__f_age,filt6)

# Compute lower bound
lb_vio_hist = features_vio_hist %>%
  filter(filt6) %>% # Now apply filter 6
  select(-filt6) %>%
  group_by(vio_hist)%>%
  top_n(n=-1,wt=raw_score__f_age)%>%
  rename(g_vio_hist = raw_score__f_age) %>%
  #arrange(vio_hist,.by_group=TRUE) %>%
  arrange(vio_hist) %>%
  ungroup()

# Use last value of g_vio_hist if vio_hist > vio_hist_cutoff
vio_hist_cutoff = 6
lb_vio_hist_cutoff = lb_vio_hist$g_vio_hist[lb_vio_hist$vio_hist==vio_hist_cutoff]

lb_vio_hist = lb_vio_hist %>%
  mutate(g_vio_hist = if_else(vio_hist < vio_hist_cutoff, g_vio_hist, lb_vio_hist_cutoff))

# Add g(vio_hist) to features
features = features %>%
  left_join(lb_vio_hist, by="vio_hist") %>%
  mutate(raw_score__f_age__g_vio_hist = raw_score__f_age - g_vio_hist)
lb_vio_hist_plot = 
  data.frame(vio_hist_xx = seq(0,13,length.out=10000)) %>%
  mutate(vio_hist = round(vio_hist_xx)) %>%
  left_join(lb_vio_hist,by="vio_hist")

ggplot() +
  geom_point(data=features_vio_hist,aes(x=vio_hist,y=raw_score__f_age,color=filt6),alpha=.5) +
  geom_line(data=lb_vio_hist_plot,aes(x=vio_hist_xx,y=g_vio_hist,color="lb"))+
  theme_bw()+
  xlab("Sum of History of Violence Components") +
  ylab(expression(Violent~score~-~f[viol~age])) + 
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") +
  scale_color_manual(name=element_blank(),
                        breaks=c("TRUE", "FALSE","lb"),
                        labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age]),expression(g[viol~hist])),
                     values=c("TRUE"="#619CFF","FALSE"="#00BA38","lb"="#F8766D"))

rm(lb_vio_hist_plot)
ggsave("Figures/vio_hist_LB_violent.pdf",width = 5, height = 3, units = "in")

Examine history of noncompliance lower bound (none found)

features %>%
  filter(filt1,filt3) %>% 
  select(history_noncomp, raw_score__f_age__g_vio_hist, filt6) %>%
  ggplot() +
  geom_point(aes(x=history_noncomp,y=raw_score__f_age__g_vio_hist,color=filt6),alpha=.5) +
  theme_bw()+
  xlab("Sum of History of Noncompliance Components") +
  ylab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist]))  +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") +
  scale_color_manual(name=element_blank(),
                       breaks=c("TRUE", "FALSE"),
                       labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age])),
                       values=c("TRUE"="#619CFF","FALSE"="#00BA38"))

ggsave("Figures/hist_noncomp_LB_violent.pdf",width = 5, height = 3, units = "in")

Predictions of (raw - age polynomial) using several ML methods

There are a few groups of predictors we will use: * Group 1: without using age variables or race * Group 2: without using age variables but with race * Group 3: without using race but with age variables * Group 4: using age variables and race

#### Generic stuff (applies to all models)

## Filter rows
features_filt = features %>%
  filter(filt1, filt3) 

## Set parameters (each combination will be run)
param <- list(objective = "reg:linear",
              eval_metric = "rmse",
              eta = c(.05,.1),
              gamma = c(.5, 1), 
              max_depth = c(2,5),
              min_child_weight = c(5,10),
              subsample = c(1),
              colsample_bytree = c(1)
)

# svm
param_svm = list(
  type = 'eps-regression',
  cost = c(0.5,1,2),
  epsilon = c(0.5,1,1.5),
  gamma_scale = c(0.5,1,2)
)

res_rmse = data.frame(Group = 1:4, lm = NA, xgb = NA, rf = NA, svm = NA)

Group 1 models: predicting (raw - age polynomial) without using age variables or race

### Create group 1 training data

## Select features and round count features
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score__f_age)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
  "label" = train %>% select(raw_score__f_age) %>% as.matrix()
)

Model 1: Linear model

mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9576 -0.3445 -0.1178  0.2532  3.7943 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.8451607  0.0153243  55.152  < 2e-16 ***
## p_age_first_offense    -0.0065319  0.0004888 -13.362  < 2e-16 ***
## p_juv_fel_count         0.1306356  0.0196060   6.663 2.84e-11 ***
## p_felprop_violarrest    0.1124162  0.0061483  18.284  < 2e-16 ***
## p_murder_arrest         0.1027832  0.0481811   2.133   0.0329 *  
## p_felassault_arrest     0.1810567  0.0114713  15.783  < 2e-16 ***
## p_misdemassault_arrest  0.1669203  0.0107051  15.593  < 2e-16 ***
## p_sex_arrest            0.1747182  0.0407972   4.283 1.87e-05 ***
## p_weapons_arrest        0.1033170  0.0158291   6.527 7.07e-11 ***
## p_n_on_probation        0.0787288  0.0051618  15.252  < 2e-16 ***
## p_current_on_probation  0.1522874  0.0260538   5.845 5.24e-09 ***
## p_prob_revoke           0.1443481  0.0195092   7.399 1.50e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5048 on 9030 degrees of freedom
## Multiple R-squared:   0.32,  Adjusted R-squared:  0.3191 
## F-statistic: 386.3 on 11 and 9030 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP

Model 2: xgboost

set.seed(46)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  15          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "1"         
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age

res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab("Violent score - f(age)") +
  ylab("XGBoost prediction")+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(55656)

mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)

res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP

Model 4: SVM

mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             1               
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "0.5"           
## gamma       "0.04166667"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 2 models: predicting (raw - age polynomial) without using age variables but with race

### Create group 2 training data

## Select features and round count features
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__f_age)


## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
  "label" = train %>% select(raw_score__f_age) %>% as.matrix()
)

Model 1: Linear model

mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9759 -0.3321 -0.1098  0.2460  3.6383 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.6137104  0.0260561  23.553  < 2e-16 ***
## p_age_first_offense    -0.0044461  0.0004968  -8.949  < 2e-16 ***
## p_juv_fel_count         0.1212941  0.0192844   6.290 3.33e-10 ***
## p_felprop_violarrest    0.1088356  0.0060435  18.009  < 2e-16 ***
## p_murder_arrest         0.1060919  0.0473390   2.241    0.025 *  
## p_felassault_arrest     0.1767698  0.0112734  15.680  < 2e-16 ***
## p_misdemassault_arrest  0.1668764  0.0105170  15.867  < 2e-16 ***
## p_sex_arrest            0.1659144  0.0400788   4.140 3.51e-05 ***
## p_weapons_arrest        0.0929469  0.0155661   5.971 2.45e-09 ***
## p_n_on_probation        0.0736827  0.0050787  14.508  < 2e-16 ***
## p_current_on_probation  0.1424988  0.0256041   5.565 2.69e-08 ***
## p_prob_revoke           0.1351758  0.0191733   7.050 1.92e-12 ***
## race_black              0.2747839  0.0225159  12.204  < 2e-16 ***
## race_white              0.1209223  0.0227808   5.308 1.13e-07 ***
## race_hispanic           0.0267705  0.0273158   0.980    0.327    
## race_asian             -0.0244794  0.0753349  -0.325    0.745    
## race_native             0.1538104  0.0962585   1.598    0.110    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4958 on 9025 degrees of freedom
## Multiple R-squared:  0.3442, Adjusted R-squared:  0.3431 
## F-statistic: 296.1 on 16 and 9025 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP

Model 2: xgboost

set.seed(390)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age

res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab("Violent score - f(age)") +
  ylab("XGBoost prediction")+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(2728)

mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)

res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP

Model 4: SVM

mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             2               
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "0.5"           
## gamma       "0.02941176"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 3 models: predicting (raw - age polynomial) without using race but with age variables

### Create group 3 training data

## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score__f_age)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
  "label" = train %>% select(raw_score__f_age) %>% as.matrix()
)

Model 1: Linear model

mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8853 -0.3396 -0.1105  0.2422  3.8069 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.7750803  0.0158841  48.796  < 2e-16 ***
## p_current_age           0.0140701  0.0009606  14.646  < 2e-16 ***
## p_age_first_offense    -0.0195380  0.0010109 -19.327  < 2e-16 ***
## p_juv_fel_count         0.1216902  0.0193879   6.277 3.62e-10 ***
## p_felprop_violarrest    0.0902790  0.0062621  14.417  < 2e-16 ***
## p_murder_arrest         0.0807082  0.0476452   1.694  0.09031 .  
## p_felassault_arrest     0.1480263  0.0115601  12.805  < 2e-16 ***
## p_misdemassault_arrest  0.1516864  0.0106318  14.267  < 2e-16 ***
## p_sex_arrest            0.1227168  0.0404793   3.032  0.00244 ** 
## p_weapons_arrest        0.0757266  0.0157582   4.806 1.57e-06 ***
## p_n_on_probation        0.0694142  0.0051413  13.501  < 2e-16 ***
## p_current_on_probation  0.1673132  0.0257716   6.492 8.91e-11 ***
## p_prob_revoke           0.0562168  0.0201996   2.783  0.00540 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4989 on 9029 degrees of freedom
## Multiple R-squared:  0.3358, Adjusted R-squared:  0.3349 
## F-statistic: 380.3 on 12 and 9029 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP

Model 2: xgboost

set.seed(34)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age

res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(Violent~score~-~f[viol~age])) +
  ylab("XGBoost prediction") +
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

ggsave("Figures/raw-fage_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(7872)

mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)

res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP

Model 4: SVM

mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             11              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.07692308"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 4 models: predicting (raw - age polynomial) using age variables and race

### Create group 4 training data

## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__f_age)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__f_age) %>% as.matrix(),
  "label" = train %>% select(raw_score__f_age) %>% as.matrix()
)

Model 1: Linear model

mdl_lm = lm(raw_score__f_age ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__f_age ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9064 -0.3264 -0.1039  0.2364  3.6516 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.5544869  0.0260945  21.249  < 2e-16 ***
## p_current_age           0.0135686  0.0009462  14.340  < 2e-16 ***
## p_age_first_offense    -0.0169983  0.0010038 -16.934  < 2e-16 ***
## p_juv_fel_count         0.1127517  0.0190788   5.910 3.55e-09 ***
## p_felprop_violarrest    0.0876690  0.0061557  14.242  < 2e-16 ***
## p_murder_arrest         0.0839132  0.0468369   1.792  0.07323 .  
## p_felassault_arrest     0.1451296  0.0113640  12.771  < 2e-16 ***
## p_misdemassault_arrest  0.1521143  0.0104506  14.556  < 2e-16 ***
## p_sex_arrest            0.1160088  0.0397845   2.916  0.00356 ** 
## p_weapons_arrest        0.0663196  0.0155042   4.278 1.91e-05 ***
## p_n_on_probation        0.0647496  0.0050606  12.795  < 2e-16 ***
## p_current_on_probation  0.1570690  0.0253391   6.199 5.94e-10 ***
## p_prob_revoke           0.0506041  0.0198557   2.549  0.01083 *  
## race_black              0.2658235  0.0222736  11.934  < 2e-16 ***
## race_white              0.1089137  0.0225424   4.831 1.38e-06 ***
## race_hispanic           0.0308834  0.0270128   1.143  0.25295    
## race_asian             -0.0139690  0.0744987  -0.188  0.85127    
## race_native             0.1331917  0.0951964   1.399  0.16181    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4903 on 9024 degrees of freedom
## Multiple R-squared:  0.3589, Adjusted R-squared:  0.3576 
## F-statistic: 297.1 on 17 and 9024 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__f_age) # ADJUST GROUP

Model 2: xgboost

set.seed(11)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  6           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age

res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(Violent~score~-~f[viol~age])) +
  ylab("XGBoost prediction")+
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12))

ggsave("Figures/raw-fage_xgboost_gp4_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

highlight = data.frame(
  person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
  screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
  highlight = TRUE
)

df_plot = features_filt %>%
  bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
  left_join(highlight, by = c("person_id","screening_date")) %>%
  mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
  mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))

person_id_text_topright = c(8491, 8375, 1497)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(5722, 11231)
person_id_text_botright = c(799, 11312, 1284, 11414)
person_id_text_botleft = c()

ggplot() +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  data = filter(df_plot, highlight=="In Table 5")) +
  theme_bw()+
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) + 
  xlab(expression(XGBoost~pred.~of~violent~score~-~f[age])) +
  ylab("Violent score")+
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12),
    #legend.position = "top",
    legend.position="none") +
  scale_color_discrete(name = element_blank())

ggsave("Figures/xgboost_rawScore_annotate_violent.pdf",width = 3, height = 3, units = "in")

Model 3: random forest

set.seed(379)

mdl_rf = randomForest(
  formula = raw_score__f_age ~ .,
  data = train
)

res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__f_age) # ADJUST GROUP

Model 4: SVM

mdl_svm = fit_svm(raw_score__f_age ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             10              
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.05555556"
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__f_age) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Comparison

knitr::kable(res_rmse)
Group lm xgb rf svm
1 0.5044421 0.4831389 0.4950131 0.4915281
2 0.4953577 0.4699155 0.4878036 0.4812358
3 0.4985543 0.4682046 0.4930407 0.4757220
4 0.4898085 0.4565221 0.4827517 0.4740605

Predictions using xgboost only

We use the “Group 3” models but predict the raw score and the raw score minus all of the lower bounds we have fitted. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound only.

Predicting the raw score

### Create group 3 training data

## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score) %>% as.matrix(),
  "label" = train %>% select(raw_score) %>% as.matrix()
)
set.seed(347)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score

print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.465"
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab("Violent score") +
  ylab("XGBoost prediction")+
  #annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

ggsave("Figures/raw_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")

Predicting the raw score - f(age) - g(vio_hist)

### Create group 3 training data

## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score__f_age__g_vio_hist)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__f_age__g_vio_hist) %>% as.matrix(),
  "label" = train %>% select(raw_score__f_age__g_vio_hist) %>% as.matrix()
)
set.seed(7301)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  13          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__f_age__g_vio_hist

print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.4691"
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist])) +
  ylab("XGBoost prediction")+
  #annotate("text", x = 0.5, y = 4, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

ggsave("Figures/raw-fage-gviohist_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")

Replicating ProPublica logistic regression

propub = features_filt %>%
  filter(filt4) %>% # Only people with valid recidivism values
  mutate(age_low = if_else(p_current_age < 25,1,0), 
         age_high = if_else(p_current_age > 45,1,0),
         female = if_else(sex=="Female",1,0),
         n_priors = p_felony_count_person + p_misdem_count_person,
         compas_high = if_else(`Risk of Violence_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
         race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis

print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 5759"
mdl_glm = glm(compas_high ~
                female +
                age_high +
                age_low +
                as.factor(race) +
                p_charge +
                is_misdem +
                recid_violent,
              family=binomial(link='logit'), data=propub)

summary(mdl_glm)
## 
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) + 
##     p_charge + is_misdem + recid_violent, family = binomial(link = "logit"), 
##     data = propub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6118  -0.6001  -0.3213   0.7395   2.9655  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -2.368882   0.094675 -25.021  < 2e-16 ***
## female                          -0.651197   0.099532  -6.543 6.05e-11 ***
## age_high                        -1.727458   0.174615  -9.893  < 2e-16 ***
## age_low                          2.593419   0.079247  32.726  < 2e-16 ***
## as.factor(race)African-American  0.745976   0.083088   8.978  < 2e-16 ***
## as.factor(race)Asian            -0.765381   0.678659  -1.128    0.259    
## as.factor(race)Hispanic          0.005838   0.146006   0.040    0.968    
## as.factor(race)Native American   0.503070   0.740727   0.679    0.497    
## as.factor(race)Other            -0.039456   0.170241  -0.232    0.817    
## p_charge                         0.074006   0.004375  16.915  < 2e-16 ***
## is_misdem                       -0.362223   0.077149  -4.695 2.67e-06 ***
## recid_violent                    0.704532   0.093094   7.568 3.79e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7378.0  on 5758  degrees of freedom
## Residual deviance: 4934.2  on 5747  degrees of freedom
## AIC: 4958.2
## 
## Number of Fisher Scoring iterations: 6